20211009_10x_LY sc10x

only ILC2 selected
(without the upper left CC part)
(which could be mixed with other celltypes like stromal)

source("../../../analysis.10x.r")

subset ILC2

processed obj

GEX.seur.raw <- readRDS("../analysis_1101/sc10x.GEX.seur.pre1105.rds")
GEX.seur.raw
## An object of class Seurat 
## 16896 features across 18482 samples within 1 assay 
## Active assay: RNA (16896 features, 1500 variable features)
##  3 dimensional reductions calculated: pca, tsne, umap
DimPlot(GEX.seur.raw, reduction = "umap", group.by = "seurat_clusters",label = T, repel = F, label.size = 3.5) +
  DimPlot(GEX.seur.raw, reduction = "umap", group.by = "preAnno_2",label = T, repel = F, label.size = 3.2)

only cluster_1,3,4,5 selected

DimPlot(GEX.seur.raw, reduction = "umap", group.by = "seurat_clusters",label = T, repel = F, label.size = 3.5) +
  FeaturePlot(GEX.seur.raw, reduction = "umap", features = "percent.mt")

check some cutoffs

if could exclude those little group of ILC2 cells with high mito and low nFeature

DimPlot(GEX.seur.raw, reduction = "umap", group.by = "seurat_clusters",label = T, repel = F, label.size = 3.5) +
  FeaturePlot(subset(GEX.seur.raw, subset=percent.mt <7.5)
              , reduction = "umap", features = "percent.mt")

FeaturePlot(GEX.seur.raw, reduction = "umap", features = c("nFeature_RNA","nCount_RNA"))

FeaturePlot(subset(GEX.seur.raw, subset= nCount_RNA < 25000 & nFeature_RNA < 4500 & nFeature_RNA > 1000 & percent.mt < 7.5), 
            reduction = "umap", features = c("nFeature_RNA","nCount_RNA"))

only ILC2

GEX.seur <- subset(GEX.seur.raw, subset = seurat_clusters %in% c(1,3,4,5))
GEX.seur <- subset(GEX.seur, subset= nCount_RNA < 16000 & nFeature_RNA < 3600 & nFeature_RNA > 1000 & percent.mt < 7.5)
GEX.seur <- subset(GEX.seur, features = rownames(GEX.seur)[rowSums(GEX.seur@assays[['RNA']]@data > 0)>=3])
GEX.seur
## An object of class Seurat 
## 15083 features across 9076 samples within 1 assay 
## Active assay: RNA (15083 features, 1258 variable features)
##  3 dimensional reductions calculated: pca, tsne, umap
quantile(GEX.seur$nFeature_RNA,prob=seq(0,1,0.025))
##       0%     2.5%       5%     7.5%      10%    12.5%      15%    17.5% 
## 1001.000 1091.000 1159.000 1209.000 1244.000 1277.000 1307.000 1336.000 
##      20%    22.5%      25%    27.5%      30%    32.5%      35%    37.5% 
## 1362.000 1384.000 1405.000 1429.000 1453.000 1475.000 1497.250 1522.000 
##      40%    42.5%      45%    47.5%      50%    52.5%      55%    57.5% 
## 1543.000 1566.000 1586.000 1608.000 1631.000 1656.000 1677.250 1701.000 
##      60%    62.5%      65%    67.5%      70%    72.5%      75%    77.5% 
## 1726.000 1754.875 1784.000 1819.000 1851.000 1890.375 1932.250 1980.125 
##      80%    82.5%      85%    87.5%      90%    92.5%      95%    97.5% 
## 2028.000 2088.000 2151.750 2235.000 2329.500 2461.000 2618.250 2861.000 
##     100% 
## 3591.000
DimPlot(GEX.seur, reduction = "umap", group.by = "preAnno_1",label = F, repel = F, label.size = 3.5) +
  DimPlot(GEX.seur, reduction = "umap", group.by = "preAnno_2",label = F, repel = F, label.size = 3.2)

check the data

DimPlot(GEX.seur, reduction = "umap", group.by = "DoubletFinder0.1",label = F, repel = F, label.size = 3.5) +
  FeaturePlot(GEX.seur, reduction = "umap", features = "percent.mt")

plota <- FeatureScatter(GEX.seur, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident") + 
             coord_cartesian(xlim =c(0, 30000), ylim = c(0, 10))
plotb <- FeatureScatter(GEX.seur, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident") + 
             coord_cartesian(xlim =c(0, 30000), ylim = c(0, 5000))
plota + plotb

FeaturePlot(GEX.seur, reduction = "umap", features = c("nFeature_RNA","nCount_RNA"))

FeaturePlot(subset(GEX.seur, subset= nCount_RNA < 16000 & nFeature_RNA < 3200 & nFeature_RNA > 1000 & percent.mt < 7.5), 
            reduction = "umap", features = c("nFeature_RNA","nCount_RNA"))

VlnPlot(GEX.seur, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0.1, group.by = "SP.info")

VlnPlot(GEX.seur, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0.1, group.by = "FB.info")

DimPlot(GEX.seur, group.by = "FB.info", split.by = "FB.info", ncol = 4,
        cols = c("#05BE78","#5ECDF2","#739BDD","#B7E16F","#87C0C9","#7756FA","#9855B9","#6F9920"))

DimPlot(GEX.seur, group.by = "SP.info", split.by = "SP.info", ncol = 3)

genes_to_check <- c("Fcer1g","Klrb1c","Ncr1","Il22",
                    "Ccr6","Tbx21","Ifng","Eomes",
                    "Gata3","Il1rl1","Il13","Il17a",
                    "Il4","Il5","Klrg1","Rorc",
                    "Il7r","Thy1","Kdm6b","Ptprc",
                    "Cd3d","Cd3g","Cd4",
                    #"Cd8a",
                    "Trac","Trdc","Epcam","Pecam1","Siglech",
                    "Hist1h1b","Top2a","Mcm6","Mki67")

FeaturePlot(GEX.seur, features = genes_to_check, ncol = 4)

re-clustering

# remove unwanted metadata
GEX.seur@meta.data <- GEX.seur@meta.data[,c(1:8,12,14,15:17)]
head(GEX.seur@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt  FB.info
## AAACCCAAGCATCCTA-1   ILC2_GEX      13805         3251   3.940601 hashtag6
## AAACCCAAGGGCAACT-1   ILC2_GEX       5200         2039   3.518554 hashtag5
## AAACCCAAGGTGCCAA-1   ILC2_GEX       4848         1503   3.361518 hashtag1
## AAACCCACAGAAGTGC-1   ILC2_GEX       5665         1673   4.165931 hashtag3
## AAACCCAGTAGGGTAC-1   ILC2_GEX       3458         1574   1.098583 hashtag5
## AAACCCATCGGTCGAC-1   ILC2_GEX       3298         1169   4.093390 hashtag6
##                        S.Score  G2M.Score Phase DoubletFinder0.05
## AAACCCAAGCATCCTA-1 -0.08767960 -0.3449294    G1           Singlet
## AAACCCAAGGGCAACT-1 -0.04374438 -0.1981341    G1           Singlet
## AAACCCAAGGTGCCAA-1 -0.03750102 -0.1720957    G1           Singlet
## AAACCCACAGAAGTGC-1 -0.11390714 -0.2737016    G1           Singlet
## AAACCCAGTAGGGTAC-1 -0.03319338 -0.1208448    G1           Singlet
## AAACCCATCGGTCGAC-1 -0.05704899 -0.2030185    G1           Singlet
##                    DoubletFinder0.1 preAnno_1 preAnno_2  SP.info
## AAACCCAAGCATCCTA-1          Singlet      ILC2    ILC2_4 IL-33_DA
## AAACCCAAGGGCAACT-1          Singlet      ILC2    ILC2_3    IL-33
## AAACCCAAGGTGCCAA-1          Singlet      ILC2    ILC2_3      PBS
## AAACCCACAGAAGTGC-1          Singlet      ILC2    ILC2_1    IL-33
## AAACCCAGTAGGGTAC-1          Singlet      ILC2    ILC2_2    IL-33
## AAACCCATCGGTCGAC-1          Singlet      ILC2    ILC2_3 IL-33_DA
plota <- FeatureScatter(GEX.seur, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident")  + 
             coord_cartesian(xlim =c(0, 20000), ylim = c(0, 10))
plotb <- FeatureScatter(GEX.seur, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident") + 
             coord_cartesian(xlim =c(0, 20000), ylim = c(0, 4000))
plota + plotb

GEX.seur <- FindVariableFeatures(GEX.seur, selection.method = "vst", nfeatures = 800)

# Identify the 10 most highly variable genes
#top10 <- head(VariableFeatures(GEX.seur), 10)
top40 <- head(VariableFeatures(GEX.seur), 40)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(GEX.seur)
plot2 <- LabelPoints(plot = plot1, points = top40, repel = T)
plot1 + plot2

top40
##  [1] "Cxcl1"    "Il5"      "Cxcl2"    "Il13"     "Ptgs2"    "Cxcl10"  
##  [7] "Hspa1a"   "Ccl1"     "Areg"     "Cxcl3"    "Il17a"    "Rrad"    
## [13] "Serpine1" "Csf2"     "Mt1"      "Il1r2"    "Akap12"   "Tnfsf8"  
## [19] "Cd83"     "Il2"      "Gm10827"  "Hdc"      "Ccr7"     "Gm43305" 
## [25] "Gm12840"  "Penk"     "Bambi"    "Atf3"     "Calca"    "Hspa1b"  
## [31] "Ier3"     "Tmem80"   "Plac8"    "Klf2"     "Il17f"    "Terf2ip" 
## [37] "Il4"      "Tnfrsf4"  "Gem"      "AA467197"
GEX.seur <- ScaleData(GEX.seur, features = rownames(GEX.seur))
## Centering and scaling data matrix

PCA

# exclude MT genes  
MT_gene <- grep("^mt-",rownames(GEX.seur),value = T)
Ribo_gene <- grep("^Rpl|^Rps", rownames(GEX.seur),value = T)
GEX.seur <- RunPCA(GEX.seur, features = setdiff(VariableFeatures(object = GEX.seur),c(MT_gene,Ribo_gene)))
## PC_ 1 
## Positive:  Txnip, Gm42418, Tmsb4x, Sh3bgrl3, Mns1, S100a6, S100a4, S100a10, Lgals1, Cd69 
##     Vim, S100a11, Ahnak, Actn2, Capg, Cd3g, Cd52, Tnf, Crip1, Serpina3g 
##     Csf1, Cflar, Gm20627, Serpinb6a, Stc2, Malat1, Tsc22d3, Dleu2, Filip1l, Lgals3 
## Negative:  Kdm6b, Rora, Hilpda, Rel, Sub1, Samsn1, Nfkb1, Nr4a3, Junb, Srgn 
##     Areg, Pim1, Nabp1, Crem, Nfkbia, Ramp3, Rgs2, Odc1, Dot1l, Ifrd1 
##     Zfp36l1, Dennd4a, Mpp7, Nr4a1, Dusp5, Gata3, Il5, Cdkn1a, H3f3b, Isy1 
## PC_ 2 
## Positive:  Malat1, Lmo4, Neurl3, Gm20186, Fosb, Gm42418, Hes1, Vps37b, Calca, Nr4a1 
##     Satb1, Kctd12, Cebpb, Hs3st1, Prdx6, Zfp36, Zfp36l1, Junb, Tiparp, Neb 
##     Bmp2, Gimap6, Ccr9, Jund, Tcrg-C1, Nr4a2, Btg2, Eprs, Cd24a, Fos 
## Negative:  Lgals1, Cd52, S100a6, Crip1, S100a11, S100a4, Vim, S100a10, Tmsb4x, Sh3bgrl3 
##     Actb, Ahnak, Capg, Anxa2, Cd83, Serpinb6a, Arg1, Irf4, Lgals3, Tnfsf8 
##     Zeb2, Myadm, Il18rap, Ttc39c, Dusp4, Bcl2a1d, Pcsk1, Tnfsf4, Alcam, Csf2 
## PC_ 3 
## Positive:  Ramp3, Il5, Ctla2a, Tnfrsf9, Igsf5, Gm42418, Calca, Ell2, Srgn, Errfi1 
##     Bcl2a1d, Raph1, Ccl1, Crem, Isy1, Cd3g, Lmo4, Fth1, Il13, Tent5a 
##     Slc18a2, Palld, Hs3st1, Pdcd1, Irs2, Hif1a, Traf4, Rel, Rab27b, Zfyve16 
## Negative:  Zfp36, Jun, Dusp1, Fos, Hspa1b, Junb, Ltb, Ier2, Nfkbia, Hspa1a 
##     Txnip, Klf6, Hsp90aa1, Fosb, Nr4a1, Gpr183, Dnajb1, Cxcr4, Klf2, Gm17056 
##     Dnaja1, Hes1, Bmp2, Phlda1, Rhob, Lztfl1, Mns1, Btg2, Tcrg-C4, Rgs2 
## PC_ 4 
## Positive:  Gadd45b, Bhlhe40, Ccl1, Cd69, Fosb, Spry1, Srgn, Egr3, Pim1, Nr4a1 
##     Uhrf2, Csf2, Nfkbia, Il2, Map3k8, Nr4a2, Cxcl2, Plk3, Tnfsf11, Rora 
##     Malt1, Lif, Rgcc, Areg, Dennd4a, Sla, Mir155hg, Ifrd1, Tob2, Arg1 
## Negative:  Dnaja1, Txnip, Hsp90aa1, Tnfrsf9, Hspa1b, Id2, Hspa1a, Pcyt1a, Dnajb1, Isy1 
##     Gata3, Icos, Dhx40, Tent5a, Ly6a, Socs1, D16Ertd472e, Crem, Ddit4, Plin2 
##     Hist1h1e, Plaur, Kcnq1ot1, Cebpb, Kdm2b, Srxn1, Abcb1b, Dgat2, Dgat1, Cwc25 
## PC_ 5 
## Positive:  Cd3g, Fosb, Ctla2a, Klf6, Ipmk, Socs2, Gm26802, Fos, Hspa1b, Jun 
##     Ahnak, Lmna, Hs3st1, Rab27b, Rhob, Hist1h2bc, Dusp1, Ythdc1, Jund, Enah 
##     Txnip, Hsp90aa1, Hspa1a, Wwtr1, Dnajb1, Egr1, Tsc22d3, Gpr171, Stc2, Malat1 
## Negative:  Stab2, Cd83, Pxdc1, Icos, Gem, Gpr183, Tmsb4x, Ankrd33b, Lztfl1, Gm45716 
##     Bmp2, B3gnt2, Gadd45b, Igfbp4, Ikzf3, Ccr7, Tnfsf4, Ier5l, Arl5c, Cd70 
##     Gm43305, Ltb, Sema6d, Ccr4, Pim1, Nr4a1, Pde4b, Tnfsf9, Dgat2, Cxcr6
DimPlot(GEX.seur, reduction = "pca",dims = 1:2, group.by = "preAnno_1") +
  DimPlot(GEX.seur, reduction = "pca",dims = 3:4, group.by = "preAnno_1")

DimPlot(GEX.seur, reduction = "pca",dims = 1:2, group.by = "preAnno_2") +
  DimPlot(GEX.seur, reduction = "pca",dims = 3:4, group.by = "preAnno_2")

DimHeatmap(GEX.seur, dims = 1:24, cells = 1500, balanced = TRUE,ncol = 4)

decide PCs to use
ElbowPlot(GEX.seur,ndims = 24)

would take PCs 1-15 for next

PCs <- 1:15
GEX.seur <- FindNeighbors(GEX.seur, dims = PCs,k.param = 80)
## Computing nearest neighbor graph
## Computing SNN
GEX.seur <- FindClusters(GEX.seur,  resolution = 0.6, method = 'igraph')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9076
## Number of edges: 1371934
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7100
## Number of communities: 6
## Elapsed time: 5 seconds

tsne/umap

#GEX.seur <- RunTSNE(GEX.seur, dims=PCs, max_iter = 2000)   
#GEX.seur <- RunUMAP(GEX.seur, dims=PCs, n.neighbors = 80) 
DimPlot(GEX.seur, reduction = "tsne", label = T) + DimPlot(GEX.seur, reduction = "umap", label = T)

check the data

DimPlot(GEX.seur, reduction = "umap", group.by = "DoubletFinder0.1",label = F, repel = F, label.size = 3.5) +
  FeaturePlot(GEX.seur, reduction = "umap", features = "percent.mt")

FeaturePlot(GEX.seur, reduction = "umap", features = c("nFeature_RNA","nCount_RNA"))

VlnPlot(GEX.seur, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0.1, group.by = "SP.info")

VlnPlot(GEX.seur, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0.1, group.by = "FB.info")

DimPlot(GEX.seur, group.by = "FB.info", split.by = "FB.info", ncol = 4,
        cols = c("#05BE78","#5ECDF2","#739BDD","#B7E16F","#87C0C9","#7756FA","#9855B9","#6F9920"))

DimPlot(GEX.seur, group.by = "SP.info", split.by = "SP.info", ncol = 3)

check markers

DotPlot(GEX.seur, features = genes_to_check[genes_to_check %in% rownames(GEX.seur)],
             assay='RNA' , group.by = "seurat_clusters" )  + coord_flip()

genes_to_check <- c("Fcer1g","Klrb1c","Ncr1","Il22",
                    "Ccr6","Tbx21","Ifng","Eomes",
                    "Gata3","Il1rl1","Il13","Il17a",
                    "Il4","Il5","Klrg1","Rorc",
                    "Il7r","Thy1","Kdm6b","Ptprc",
                    "Cd3d","Cd3g","Cd4",
                    #"Cd8a",
                    "Trac","Trdc","Epcam","Pecam1","Siglech",
                    "Hist1h1b","Top2a","Mcm6","Mki67")

FeaturePlot(GEX.seur, features = genes_to_check, ncol = 4)

# 
genes_nature2017 <- c("Tbx21","Ifng","Ccl5","Il12rb1",
                      "Gata3","Il1rl1","Il5","Il13",
                      "Rorc","Il17a","Ahr","Il23r")

DotPlot(GEX.seur, features = genes_nature2017[genes_nature2017 %in% rownames(GEX.seur)],
             assay='RNA' , group.by = "seurat_clusters" )  + coord_flip()

FeaturePlot(GEX.seur, features = genes_nature2017, ncol = 4)

find markers

# find markers for every cluster compared to all remaining cells, report only the positive ones
#GEX.markers.ILC2_adv <- FindAllMarkers(GEX.seur, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.1,
#                                  test.use = "MAST")
GEX.markers.ILC2_adv <- read.table("GEX.markers.ILC2_adv.1110.csv", header = TRUE, sep = ",")

markers.top16 <- GEX.markers.ILC2_adv %>% group_by(cluster) %>% filter(avg_log2FC>0.25)  %>% top_n(n = 16, wt = avg_log2FC)
markers.top16
## # A tibble: 96 x 7
## # Groups:   cluster [6]
##       p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene   
##       <dbl>      <dbl> <dbl> <dbl>     <dbl>   <int> <chr>  
##  1 6.78e-71      0.333 1     1      1.02e-66       0 Gm42418
##  2 3.40e-69      0.438 0.635 0.570  5.13e-65       0 Gimap3 
##  3 3.54e-68      0.354 0.438 0.381  5.33e-64       0 Crebzf 
##  4 1.05e-63      0.390 0.35  0.268  1.59e-59       0 Fam122a
##  5 9.60e-59      0.295 0.706 0.711  1.45e-54       0 Tra2a  
##  6 4.90e-57      0.260 0.891 0.905  7.39e-53       0 Rps27rt
##  7 1.06e-56      0.276 0.439 0.412  1.60e-52       0 Snhg12 
##  8 1.38e-46      0.301 0.713 0.692  2.09e-42       0 Bcl2   
##  9 3.99e-46      0.271 0.71  0.7    6.01e-42       0 Cnot6l 
## 10 1.31e-45      0.282 0.406 0.359  1.97e-41       0 Herc4  
## # ... with 86 more rows
#GEX.markers.ILC2_pre %>% group_by(cluster) %>% filter(pct.1 > 0.2 & pct.2 <0.1) %>% top_n(n = 8, wt = avg_log2FC)
#GEX.markers.ILC2_pre %>% group_by(cluster) %>% top_n(n = 80, wt = avg_log2FC) %>% filter(gene %in% c("Top2a","Mki67") )
# 
DotPlot(GEX.seur, features = markers.top16[!duplicated(markers.top16$gene),"gene"],
             assay='RNA' , group.by = "seurat_clusters" )  + coord_flip()

FeaturePlot(GEX.seur, features = markers.top16$gene[1:96],
            reduction = 'umap')

VlnPlot(GEX.seur, features = markers.top16$gene[1:96],
            group.by = "seurat_clusters")

GEX.seur$Anno_ILC2 <- factor(paste0("ILC2_",as.character(GEX.seur$seurat_clusters)),
                             levels = paste0("ILC2_",c(0,4,1,5,2,3)))
marker.new <- c((markers.top16 %>% filter(cluster==0))$gene,
                (markers.top16 %>% filter(cluster==4))$gene,
                (markers.top16 %>% filter(cluster==1))$gene,
                (markers.top16 %>% filter(cluster==5))$gene,
                (markers.top16 %>% filter(cluster==2))$gene,
                (markers.top16 %>% filter(cluster==3))$gene)
p.heat_top16 <- DoHeatmap(GEX.seur , group.by = "Anno_ILC2", 
             features = marker.new) +
             NoLegend() + labs(title = "ILC2_newclusters top16\n")
p.heat_top16

marker.top80 <- c((GEX.markers.ILC2_adv %>% group_by(cluster) %>% filter(avg_log2FC >0.25) %>% top_n(n = 80, wt = avg_log2FC) %>% filter(cluster==0))$gene,
                (GEX.markers.ILC2_adv %>% group_by(cluster) %>% filter(avg_log2FC >0.25) %>% top_n(n = 80, wt = avg_log2FC) %>% filter(cluster==4))$gene,
                (GEX.markers.ILC2_adv %>% group_by(cluster) %>% filter(avg_log2FC >0.25) %>% top_n(n = 80, wt = avg_log2FC) %>% filter(cluster==1))$gene,
                (GEX.markers.ILC2_adv %>% group_by(cluster) %>% filter(avg_log2FC >0.25) %>% top_n(n = 80, wt = avg_log2FC) %>% filter(cluster==5))$gene,
                (GEX.markers.ILC2_adv %>% group_by(cluster) %>% filter(avg_log2FC >0.25) %>% top_n(n = 80, wt = avg_log2FC) %>% filter(cluster==2))$gene,
                (GEX.markers.ILC2_adv %>% group_by(cluster) %>% filter(avg_log2FC >0.25) %>% top_n(n = 80, wt = avg_log2FC) %>% filter(cluster==3))$gene)
p.heat_top80 <- DoHeatmap(GEX.seur , group.by = "Anno_ILC2", 
             features = marker.top80) +
             NoLegend() + labs(title = "ILC2_newclusters top80\n")+ theme( axis.text.y = element_text( size = 4 ))
p.heat_top80

condition comparison

DEGs

lapply(GEX.DEGs_adv,function(x) x %>% group_by(cluster) %>% top_n(n = 8, wt = avg_log2FC))
## $ILC2_0
## # A tibble: 18 x 8
## # Groups:   cluster [3]
##       p_val avg_log2FC pct.1 pct.2 p_val_adj cluster  gene     Anno_ILC2
##       <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>    <chr>    <chr>    
##  1 6.16e-14      0.530 0.49  0.376  9.29e-10 PBS      Gm47283  ILC2_0   
##  2 1.64e-11      0.403 0.804 0.702  2.48e- 7 PBS      Rbm3     ILC2_0   
##  3 7.68e- 9      0.329 0.947 0.878  1.16e- 4 PBS      S100a4   ILC2_0   
##  4 1.94e- 7      0.344 0.921 0.871  2.93e- 3 PBS      Nrip1    ILC2_0   
##  5 3.09e- 6      0.316 0.686 0.626  4.66e- 2 PBS      Gimap3   ILC2_0   
##  6 2.29e- 5      0.335 0.548 0.441  3.45e- 1 PBS      Nup98    ILC2_0   
##  7 1.17e- 4      0.377 0.472 0.378  1.00e+ 0 PBS      Arg1     ILC2_0   
##  8 9.23e- 3      0.312 0.496 0.437  1.00e+ 0 PBS      Tnfsf11  ILC2_0   
##  9 2.19e-14      0.277 0.998 0.992  3.30e-10 IL-33    Hsp90ab1 ILC2_0   
## 10 1.38e- 6      0.268 0.533 0.429  2.08e- 2 IL-33    Ltb4r1   ILC2_0   
## 11 2.47e- 6      0.262 0.826 0.763  3.72e- 2 IL-33    Tnfrsf18 ILC2_0   
## 12 4.48e- 4      0.278 0.669 0.589  1.00e+ 0 IL-33    Calca    ILC2_0   
## 13 8.85e- 4      0.253 0.612 0.548  1.00e+ 0 IL-33    Hsp90aa1 ILC2_0   
## 14 3.87e- 8      0.376 0.546 0.433  5.84e- 4 IL-33_DA Gm43305  ILC2_0   
## 15 6.54e- 7      0.350 0.536 0.492  9.87e- 3 IL-33_DA Odc1     ILC2_0   
## 16 7.48e- 5      0.254 0.502 0.443  1.00e+ 0 IL-33_DA St8sia4  ILC2_0   
## 17 9.94e- 5      0.308 0.586 0.528  1.00e+ 0 IL-33_DA Nr4a2    ILC2_0   
## 18 3.98e- 4      0.324 0.742 0.702  1.00e+ 0 IL-33_DA AY036118 ILC2_0   
## 
## $ILC2_4
## # A tibble: 24 x 8
## # Groups:   cluster [3]
##       p_val avg_log2FC pct.1 pct.2    p_val_adj cluster gene    Anno_ILC2
##       <dbl>      <dbl> <dbl> <dbl>        <dbl> <fct>   <chr>   <chr>    
##  1 6.31e-12      0.672 0.667 0.422 0.0000000952 PBS     Gm47283 ILC2_4   
##  2 3.63e- 5      0.676 0.456 0.344 0.548        PBS     Mxd1    ILC2_4   
##  3 3.48e- 4      0.450 0.825 0.798 1            PBS     Nrip1   ILC2_4   
##  4 4.21e- 4      1.35  0.246 0.166 1            PBS     Atf3    ILC2_4   
##  5 5.09e- 4      0.520 0.667 0.556 1            PBS     Cebpb   ILC2_4   
##  6 6.25e- 4      0.500 0.316 0.242 1            PBS     Snhg15  ILC2_4   
##  7 3.04e- 3      0.547 0.228 0.178 1            PBS     Ccl4    ILC2_4   
##  8 6.55e- 3      0.641 0.588 0.481 1            PBS     Hes1    ILC2_4   
##  9 4.21e- 6      0.551 0.431 0.296 0.0635       IL-33   Egr1    ILC2_4   
## 10 5.18e- 6      0.304 0.74  0.65  0.0781       IL-33   Ltb4r1  ILC2_4   
## # ... with 14 more rows
## 
## $ILC2_1
## # A tibble: 22 x 8
## # Groups:   cluster [3]
##       p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene     Anno_ILC2
##       <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    <chr>    
##  1 3.03e-15      0.681 0.585 0.408  4.57e-11 PBS     Gm47283  ILC2_1   
##  2 6.60e- 8      0.504 0.88  0.819  9.96e- 4 PBS     Lmo4     ILC2_1   
##  3 6.65e- 8      0.526 0.76  0.648  1.00e- 3 PBS     Neurl3   ILC2_1   
##  4 4.15e- 7      0.375 0.975 0.984  6.26e- 3 PBS     Junb     ILC2_1   
##  5 2.53e- 6      0.395 0.69  0.644  3.82e- 2 PBS     Ptpn13   ILC2_1   
##  6 4.10e- 6      0.381 0.97  0.945  6.18e- 2 PBS     Zfp36l2  ILC2_1   
##  7 8.46e- 6      0.389 0.81  0.746  1.28e- 1 PBS     Vps37b   ILC2_1   
##  8 1.85e- 3      0.383 0.245 0.212  1.00e+ 0 PBS     Hist1h1c ILC2_1   
##  9 5.34e- 9      0.424 0.392 0.263  8.06e- 5 IL-33   Cd3g     ILC2_1   
## 10 1.57e- 8      0.251 0.7   0.569  2.37e- 4 IL-33   Hsp90aa1 ILC2_1   
## # ... with 12 more rows
## 
## $ILC2_5
## # A tibble: 24 x 8
## # Groups:   cluster [3]
##       p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene   Anno_ILC2
##       <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>  <chr>    
##  1 6.96e- 9      0.732 0.581 0.688  1.05e- 4 PBS     Stat4  ILC2_5   
##  2 3.39e- 7      0.818 0.452 0.17   5.11e- 3 PBS     Hexim1 ILC2_5   
##  3 7.16e- 5      0.817 0.581 0.548  1.00e+ 0 PBS     Spry1  ILC2_5   
##  4 1.21e- 4      0.701 0.903 0.868  1.00e+ 0 PBS     Itgav  ILC2_5   
##  5 1.06e- 3      1.07  0.839 0.609  1.00e+ 0 PBS     Ramp3  ILC2_5   
##  6 1.20e- 3      1.02  0.903 0.69   1.00e+ 0 PBS     Cxcl2  ILC2_5   
##  7 1.60e- 3      0.759 0.903 0.8    1.00e+ 0 PBS     Klf6   ILC2_5   
##  8 3.14e- 3      0.779 0.903 0.933  1.00e+ 0 PBS     Fos    ILC2_5   
##  9 5.71e-16      0.404 0.982 0.961  8.61e-12 IL-33   Coro1a ILC2_5   
## 10 2.24e-14      0.474 0.706 0.437  3.38e-10 IL-33   Mns1   ILC2_5   
## # ... with 14 more rows
## 
## $ILC2_2
## # A tibble: 19 x 8
## # Groups:   cluster [3]
##       p_val avg_log2FC pct.1 pct.2  p_val_adj cluster  gene     Anno_ILC2
##       <dbl>      <dbl> <dbl> <dbl>      <dbl> <fct>    <chr>    <chr>    
##  1 7.42e- 9      0.494 0.562 0.373 0.000112   PBS      Gm47283  ILC2_2   
##  2 2.17e- 6      0.332 0.857 0.767 0.0327     PBS      Rbm3     ILC2_2   
##  3 4.64e- 6      0.382 0.906 0.869 0.0700     PBS      Zfp36l2  ILC2_2   
##  4 1.04e- 5      0.329 0.888 0.875 0.157      PBS      S100a4   ILC2_2   
##  5 5.32e- 5      0.365 0.79  0.71  0.802      PBS      Hsp90aa1 ILC2_2   
##  6 8.28e- 5      0.336 0.513 0.455 1          PBS      Zcchc10  ILC2_2   
##  7 4.47e- 3      0.348 0.692 0.593 1          PBS      Hes1     ILC2_2   
##  8 7.13e- 3      0.357 0.201 0.174 1          PBS      Dgat2    ILC2_2   
##  9 8.22e- 5      0.371 0.751 0.679 1          IL-33    Csf2     ILC2_2   
## 10 5.26e- 4      0.361 0.654 0.567 1          IL-33    Cxcl2    ILC2_2   
## 11 3.33e- 3      0.311 0.411 0.33  1          IL-33    Ccrl2    ILC2_2   
## 12 2.88e-10      0.402 0.671 0.515 0.00000434 IL-33_DA Gm43305  ILC2_2   
## 13 9.04e- 6      0.275 0.411 0.344 0.136      IL-33_DA Jarid2   ILC2_2   
## 14 1.71e- 5      0.296 0.42  0.328 0.258      IL-33_DA Cd27     ILC2_2   
## 15 2.36e- 5      0.265 0.17  0.106 0.356      IL-33_DA Epcam    ILC2_2   
## 16 1.86e- 4      0.270 0.687 0.665 1          IL-33_DA Icos     ILC2_2   
## 17 3.81e- 4      0.280 0.705 0.63  1          IL-33_DA Ramp3    ILC2_2   
## 18 6.05e- 4      0.285 0.728 0.711 1          IL-33_DA AY036118 ILC2_2   
## 19 8.81e- 4      0.250 0.765 0.767 1          IL-33_DA Odc1     ILC2_2   
## 
## $ILC2_3
## # A tibble: 24 x 8
## # Groups:   cluster [3]
##       p_val avg_log2FC pct.1 pct.2     p_val_adj cluster gene          Anno_ILC2
##       <dbl>      <dbl> <dbl> <dbl>         <dbl> <fct>   <chr>         <chr>    
##  1 3.58e- 7      0.472 0.604 0.48  0.00540       PBS     Gm47283       ILC2_3   
##  2 8.70e- 6      0.413 0.732 0.69  0.131         PBS     4932438A13Rik ILC2_3   
##  3 2.47e- 5      0.351 0.47  0.363 0.372         PBS     Gm2000        ILC2_3   
##  4 5.36e- 4      0.409 0.336 0.24  1             PBS     Lpar6         ILC2_3   
##  5 5.86e- 4      0.376 0.356 0.256 1             PBS     Tuba4a        ILC2_3   
##  6 4.03e- 3      0.431 0.946 0.912 1             PBS     Eprs          ILC2_3   
##  7 6.11e- 3      0.333 0.43  0.37  1             PBS     Mast4         ILC2_3   
##  8 9.96e- 3      0.336 0.826 0.763 1             PBS     Cebpb         ILC2_3   
##  9 1.31e-13      0.452 0.754 0.552 0.00000000197 IL-33   Ltb4r1        ILC2_3   
## 10 3.74e- 6      0.312 0.69  0.554 0.0563        IL-33   Sh2d2a        ILC2_3   
## # ... with 14 more rows
lapply(GEX.DEGs_adv,dim)
## $ILC2_0
## [1] 23  8
## 
## $ILC2_4
## [1] 75  8
## 
## $ILC2_1
## [1] 68  8
## 
## $ILC2_5
## [1] 421   8
## 
## $ILC2_2
## [1] 29  8
## 
## $ILC2_3
## [1] 59  8
vln_DEGs.adv <- list()
Idents(GEX.seur) <- "SP.info"
for(nn in names_adv){
  pp.test <- VlnPlot(subset(GEX.seur, subset = Anno_ILC2 == nn), 
               features = (GEX.DEGs_adv[[nn]] %>% group_by(cluster) %>% top_n(n = 8, wt = avg_log2FC))$gene,
               combine = F)
  title <- cowplot::ggdraw() + cowplot::draw_label(nn, fontface = 'bold', size = 9)
  for(pp in 1:length(pp.test)){
    pp.test[[pp]] <- pp.test[[pp]] + NoLegend()
  }
  vln_DEGs.adv[[nn]] <- cowplot::plot_grid(title = title,plotlist =  pp.test, ncol = 4)
}
Idents(GEX.seur) <- "Anno_ILC2"
top 8 vlnplot
## $ILC2_0

## 
## $ILC2_4

## 
## $ILC2_1

## 
## $ILC2_5

## 
## $ILC2_2

## 
## $ILC2_3

cell composition

FB.info

Idents(GEX.seur) <- "Anno_ILC2"
DimPlot(GEX.seur, reduction = "umap", label = F,group.by = "FB.info",
        cols = c("#05BE78","#5ECDF2","#739BDD","#B7E16F","#87C0C9","#7756FA","#9855B9","#6F9920"))+
  DimPlot(GEX.seur, reduction = "umap", label = T)

DimPlot(GEX.seur, group.by = "FB.info", split.by = "FB.info", ncol = 4,
        cols = c("#05BE78","#5ECDF2","#739BDD","#B7E16F","#87C0C9","#7756FA","#9855B9","#6F9920"))

table(data.frame(GEX.seur$FB.info,GEX.seur$Anno_ILC2))
##                 GEX.seur.Anno_ILC2
## GEX.seur.FB.info ILC2_0 ILC2_4 ILC2_1 ILC2_5 ILC2_2 ILC2_3
##         hashtag1    101     30     26      8     82     64
##         hashtag2    240     84    174     23    142     85
##         hashtag3    239    212    389    327    203    115
##         hashtag4    266    147    304     97    168    110
##         hashtag5    313    175    413    178    256    181
##         hashtag6    378    214    197     84    236    143
##         hashtag7    303     73    259    170    188    204
##         hashtag8    449    136    172     26    396    296
pheatmap::pheatmap(table(data.frame(GEX.seur$FB.info,GEX.seur$Anno_ILC2)),
                   main = "Cell Count",
                   cluster_rows = F, cluster_cols = F, display_numbers = T, number_format = "%.0f")

pheatmap::pheatmap(100*rowRatio(table(data.frame(GEX.seur$FB.info,GEX.seur$Anno_ILC2))),
                   main = "Percentage of Sample count(row)",
                   cluster_rows = F, cluster_cols = F, display_numbers = T, number_format = "%.2f")

SP.info

DimPlot(GEX.seur, reduction = "umap", label = F,group.by = "SP.info",
        cols = c("#76797C","#0000C8","#FDB911"))+
  DimPlot(GEX.seur, reduction = "umap", label = T)

DimPlot(GEX.seur, group.by = "SP.info", split.by = "SP.info",cols = c("#76797C","#0000C8","#FDB911"), ncol = 3)

table(data.frame(GEX.seur$SP.info,GEX.seur$Anno_ILC2))
##                 GEX.seur.Anno_ILC2
## GEX.seur.SP.info ILC2_0 ILC2_4 ILC2_1 ILC2_5 ILC2_2 ILC2_3
##         PBS         341    114    200     31    224    149
##         IL-33       818    534   1106    602    627    406
##         IL-33_DA   1130    423    628    280    820    643
pheatmap::pheatmap(table(data.frame(GEX.seur$SP.info,GEX.seur$Anno_ILC2)),
                   main = "Cell Count",
                   cluster_rows = F, cluster_cols = F, display_numbers = T, number_format = "%.0f")

pheatmap::pheatmap(100*rowRatio(table(data.frame(GEX.seur$SP.info,GEX.seur$Anno_ILC2))),
                   main = "Percentage of Sample count(row)",
                   cluster_rows = F, cluster_cols = F, display_numbers = T, number_format = "%.2f")

topic model

basically follow https://bitbucket.org/jerry00/mouse_small_intestine_immune_cell_atlas/src/master/script/topicModeling_PP.R

source("../analysis_1101/topicmodel.r")

calculate topics

# using umap coordinates
GEX.seur@reductions[['umap']]@cell.embeddings[1:5,1:2]
##                        UMAP_1     UMAP_2
## AAACCCAAGCATCCTA-1 -1.1394443  3.0657358
## AAACCCAAGGGCAACT-1 -3.0438967 -0.2278638
## AAACCCAAGGTGCCAA-1 -3.9796829 -1.0369316
## AAACCCACAGAAGTGC-1 -0.7091646 -1.2534755
## AAACCCAGTAGGGTAC-1  4.3127932  1.5042106
# to exclude ribosomal
ribo.loc <- grepl("Rpl|Rps", rownames(GEX.seur@assays[['RNA']]@counts))
head(rownames(GEX.seur@assays[['RNA']]@counts)[ribo.loc])
## [1] "Rpl7"    "Rpl31"   "Rpl37a"  "Rps6kc1" "Rpl7a"   "Rpl12"
sum(ribo.loc)
## [1] 96
# Build topic models and generate assocaited figures 
# Recommended to build models for k = 3,4,5,6,7,8,9,10 and tolerance 0.1 (set as inputs to this function)
# each K would take hours, bigger K, longer time
# so slow, would run into three sub-scripts from 3 to 10
#for(kk in c(3,4,5,6,7,8,9,10)){
#  kkk <- kk
#  if(kk %in% c(4,6,8)){
#    kkk <- paste0(0,kk)
#  }
#  FitGoM(t(as.matrix(GEX.seur@assays[['RNA']]@counts)[!ribo.loc,]),
#         K = kk, tol = 0.1,
#         path_rda =paste0("topicmodel/test_topic.n",kkk,"_t0.1.rda"))
#}